This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
options(scipen = 999)
library(tidyverse)
library(ggplot2)
library(plotly)
library(data.table)
library(lubridate)
library(dplyr)
library(corrplot)
library(leaflet)
#Get only the Summary data for some initial analysis
summary_raw_data<-read_csv ('data/claim_summary_v1.csv' )
Parsed with column specification:
cols(
.default = col_double(),
esco_id = [31mcol_character()[39m,
bene_hic_num = [31mcol_character()[39m,
claim_first_dialysis_date = [31mcol_character()[39m,
claim_last_dialysis_date = [31mcol_character()[39m
)
See spec(...) for full column specifications.
summary_wds<-summary_raw_data
#Get the data with all the required fields for analysis
#claim_detail_raw_data<-fread("data/claim_details.csv",sep = "|",fill = T)
#detail_raw_data<-read_csv ('data/claim_details.csv' )
locations<-read_csv('data/ESCo_LOCATIONS_lo.csv' )
Parsed with column specification:
cols(
location_id = [31mcol_character()[39m,
short_name = [31mcol_character()[39m,
location_address_1 = [31mcol_character()[39m,
location_address_2 = [31mcol_character()[39m,
location_city = [31mcol_character()[39m,
location_state = [31mcol_character()[39m,
location_zip_code = [32mcol_double()[39m,
latitude = [32mcol_double()[39m,
longitude = [32mcol_double()[39m
)
INITIAL ANALYSIS WITH THE SUMMARY DATA ONLY.
summary_wds %>%
select (-esco_id,-bene_hic_num,-esco_aligned_flag) %>%
filter(patient_id == '811454') %>%
arrange(dos_year,dos_month)
NA
str(summary_wds)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 10364 obs. of 35 variables:
$ esco_id : chr "E0050" "E0050" "E0050" "E0050" ...
$ bene_hic_num : chr "161446232A" "161446232A" "161446232A" "161480276A" ...
$ patient_id : num 811454 811454 811454 814249 814249 ...
$ dos_year : num 2018 2018 2018 2017 2017 ...
$ dos_month : num 1 8 11 2 5 1 4 4 2 8 ...
$ claim_first_dialysis_date: chr "2018-01-03 00:00:00.000" "2018-08-01 00:00:00.000" "2018-11-02 00:00:00.000" "2017-02-01 00:00:00.000" ...
$ claim_last_dialysis_date : chr "2018-01-31 00:00:00.000" "2018-08-31 00:00:00.000" "2018-11-30 00:00:00.000" "2017-02-06 00:00:00.000" ...
$ dci_claims : num 1 0 0 1 1 1 1 1 1 0 ...
$ non_dci_claims : num 3 7 11 3 1 1 6 67 1 3 ...
$ payment : num 2512 2853 3498 2272 2617 ...
$ part_a : num 2053 2323 2160 2068 2432 ...
$ part_b_phys : num 459 530 1300 204 185 ...
$ part_b_dme : num 0 0 39.1 0 0 ...
$ inpatient : num 0 0 0 0 0 ...
$ outpatient_dialysis : num 2053 2323 2160 2068 2432 ...
$ outpatient_er : num 0 0 0 0 0 ...
$ outpatient_other : num 0 0 0 0 0 ...
$ hha : num 0 0 0 0 0 0 0 0 0 0 ...
$ snf : num 0 0 0 0 0 ...
$ hospice : num 0 0 0 0 0 0 0 0 0 0 ...
$ full_encounter : num 0 0 0 0 0 0 0 0 0 0 ...
$ phys_neph : num 222 222 222 185 185 ...
$ phys_hosp : num 0 0 0 0 0 ...
$ phys_ed : num 0 0 0 0 0 ...
$ vasc_access : num 0 0 916 0 0 ...
$ ambulance : num 0 0 0 0 0 ...
$ phys_other : num 236.9 307.9 162.1 19.2 0 ...
$ dme : num 0 0 39.1 0 0 ...
$ esco_aligned_flag : num 1 1 1 1 1 1 1 1 1 1 ...
$ inpatient_fluid : num 0 0 0 0 0 0 0 0 0 0 ...
$ outpatient_er_fluid : num 0 0 0 0 0 0 0 0 0 0 ...
$ part_a_other_fluid : num 0 0 0 0 0 0 0 0 0 0 ...
$ inpatient_access : num 0 0 0 0 0 0 0 0 0 0 ...
$ outpatient_er_access : num 0 0 0 0 0 0 0 0 0 0 ...
$ part_a_other_access : num 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "spec")=
.. cols(
.. esco_id = [31mcol_character()[39m,
.. bene_hic_num = [31mcol_character()[39m,
.. patient_id = [32mcol_double()[39m,
.. dos_year = [32mcol_double()[39m,
.. dos_month = [32mcol_double()[39m,
.. claim_first_dialysis_date = [31mcol_character()[39m,
.. claim_last_dialysis_date = [31mcol_character()[39m,
.. dci_claims = [32mcol_double()[39m,
.. non_dci_claims = [32mcol_double()[39m,
.. payment = [32mcol_double()[39m,
.. part_a = [32mcol_double()[39m,
.. part_b_phys = [32mcol_double()[39m,
.. part_b_dme = [32mcol_double()[39m,
.. inpatient = [32mcol_double()[39m,
.. outpatient_dialysis = [32mcol_double()[39m,
.. outpatient_er = [32mcol_double()[39m,
.. outpatient_other = [32mcol_double()[39m,
.. hha = [32mcol_double()[39m,
.. snf = [32mcol_double()[39m,
.. hospice = [32mcol_double()[39m,
.. full_encounter = [32mcol_double()[39m,
.. phys_neph = [32mcol_double()[39m,
.. phys_hosp = [32mcol_double()[39m,
.. phys_ed = [32mcol_double()[39m,
.. vasc_access = [32mcol_double()[39m,
.. ambulance = [32mcol_double()[39m,
.. phys_other = [32mcol_double()[39m,
.. dme = [32mcol_double()[39m,
.. esco_aligned_flag = [32mcol_double()[39m,
.. inpatient_fluid = [32mcol_double()[39m,
.. outpatient_er_fluid = [32mcol_double()[39m,
.. part_a_other_fluid = [32mcol_double()[39m,
.. inpatient_access = [32mcol_double()[39m,
.. outpatient_er_access = [32mcol_double()[39m,
.. part_a_other_access = [32mcol_double()[39m
.. )
summary(summary_wds)
esco_id bene_hic_num patient_id dos_year dos_month claim_first_dialysis_date
Length:10364 Length:10364 Min. : 30111 Min. :2017 Min. : 1.000 Length:10364
Class :character Class :character 1st Qu.:798197 1st Qu.:2017 1st Qu.: 4.000 Class :character
Mode :character Mode :character Median :806827 Median :2018 Median : 7.000 Mode :character
Mean :711615 Mean :2018 Mean : 6.521
3rd Qu.:812964 3rd Qu.:2018 3rd Qu.: 9.000
Max. :910902 Max. :2018 Max. :12.000
claim_last_dialysis_date dci_claims non_dci_claims payment part_a part_b_phys
Length:10364 Min. :0.0000 Min. : 0.0 Min. : 34.18 Min. : 13.17 Min. : 0.0
Class :character 1st Qu.:1.0000 1st Qu.: 3.0 1st Qu.: 2664.22 1st Qu.: 2321.80 1st Qu.: 224.9
Mode :character Median :1.0000 Median : 6.0 Median : 3284.65 Median : 2730.59 Median : 369.4
Mean :0.9818 Mean : 11.7 Mean : 6916.75 Mean : 5898.76 Mean : 957.7
3rd Qu.:1.0000 3rd Qu.: 14.0 3rd Qu.: 6039.45 3rd Qu.: 4754.09 3rd Qu.: 1002.8
Max. :5.0000 Max. :138.0 Max. :139204.35 Max. :131619.23 Max. :22145.9
part_b_dme inpatient outpatient_dialysis outpatient_er outpatient_other hha snf
Min. : 0.00 Min. : 0 Min. : 0 Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
1st Qu.: 0.00 1st Qu.: 0 1st Qu.:2037 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
Median : 0.00 Median : 0 Median :2323 Median : 0.0 Median : 0.0 Median : 0.0 Median : 0.0
Mean : 60.23 Mean : 2601 Mean :2299 Mean : 106.7 Mean : 391.9 Mean : 160.5 Mean : 323.2
3rd Qu.: 4.33 3rd Qu.: 0 3rd Qu.:2609 3rd Qu.: 0.0 3rd Qu.: 115.2 3rd Qu.: 0.0 3rd Qu.: 0.0
Max. :17581.02 Max. :131619 Max. :7283 Max. :26135.5 Max. :25749.6 Max. :9011.7 Max. :22185.7
hospice full_encounter phys_neph phys_hosp phys_ed vasc_access ambulance
Min. : 0.00 Min. :0 Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0 Min. : 0.0
1st Qu.: 0.00 1st Qu.:0 1st Qu.:180.8 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 0 1st Qu.: 0.0
Median : 0.00 Median :0 Median :213.8 Median : 0.0 Median : 0.00 Median : 0 Median : 0.0
Mean : 16.37 Mean :0 Mean :189.0 Mean : 259.2 Mean : 28.58 Mean : 75 Mean : 100.7
3rd Qu.: 0.00 3rd Qu.:0 3rd Qu.:220.4 3rd Qu.: 0.0 3rd Qu.: 0.00 3rd Qu.: 0 3rd Qu.: 0.0
Max. :7032.21 Max. :0 Max. :877.3 Max. :10036.4 Max. :1007.74 Max. :5523 Max. :8815.2
phys_other dme esco_aligned_flag inpatient_fluid outpatient_er_fluid part_a_other_fluid inpatient_access
Min. : 0.00 Min. : 0.00 Min. :1 Min. : 0.0 Min. : 0.000 Min. : 0.00 Min. : 0
1st Qu.: 17.26 1st Qu.: 0.00 1st Qu.:1 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0
Median : 111.42 Median : 0.00 Median :1 Median : 0.0 Median : 0.000 Median : 0.00 Median : 0
Mean : 305.26 Mean : 60.23 Mean :1 Mean : 197.4 Mean : 7.179 Mean : 36.26 Mean : 281
3rd Qu.: 309.44 3rd Qu.: 4.33 3rd Qu.:1 3rd Qu.: 0.0 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0
Max. :16197.08 Max. :17581.02 Max. :1 Max. :42387.0 Max. :4956.360 Max. :14684.18 Max. :92109
outpatient_er_access part_a_other_access
Min. : 0.00 Min. : 0.00
1st Qu.: 0.00 1st Qu.: 0.00
Median : 0.00 Median : 0.00
Mean : 14.89 Mean : 92.67
3rd Qu.: 0.00 3rd Qu.: 0.00
Max. :11694.69 Max. :15409.49
summary(summary_wds$payment)
Min. 1st Qu. Median Mean 3rd Qu. Max.
34.18 2664.22 3284.65 6916.75 6039.45 139204.35
summary_wds %>%
ggplot(aes(x=payment)) +
geom_histogram(breaks = seq(500,100000,by=1000),
bins=20,
col="red",
fill = "green",
alpha = 0.2) +
scale_x_log10() +
labs(x="Payments", y = "Counts",title = "Payments Histogram")
change_increase<-summary_wds %>%
select (patient_id,payment,dos_year,dos_month) %>%
group_by(patient_id,dos_year) %>%
summarise(sum_payment = sum(payment), num_of_months = NROW(dos_month)) %>%
ungroup %>%
filter(num_of_months >11) %>%
arrange (num_of_months,patient_id,dos_year,desc(sum_payment))
change_increase$patient_id <- as.factor(change_increase$patient_id)
change_increase %>%
filter (patient_id == '403675')
change_increase$patient_id <- as.factor(change_increase$patient_id )
change_increase$dos_year <- as.factor(change_increase$dos_year)
pl <- ggplot( change_increase,aes(y=sum_payment, x = patient_id ,fill=dos_year)) +
geom_bar(stat = "identity",position = 'dodge') +
labs(x="patient_id", y= "Payments") +
ggtitle("yearly payments difference for patients")
pl
#pivot the data and understand hte percentage change in payments and get expensive patients
expensive_patients<-pivot_wider(change_increase,
names_from = dos_year,
values_from = sum_payment,
values_fill = list(sum_payment = 0)) %>%
mutate(percent_change = (`2018`-`2017`)/`2017` * 100) %>%
filter(percent_change > 50 & `2017` != 0) %>%
arrange (desc(percent_change))
expensive_patients
summary_wds %>%
filter(patient_id =='798094') %>%
arrange (dos_year,dos_month)
NA
year_plot<- summary_wds %>%
group_by(dos_year,dos_month) %>%
summarise(sum_pay = sum(payment)) %>%
ungroup()
year_plot
year_plot$dos_month <- as.factor(year_plot$dos_month)
year_plot$dos_year <- as.factor(year_plot$dos_year)
pl <- ggplot( year_plot,aes(y=sum_pay, x = dos_month ,fill=dos_year)) +
geom_bar(stat = "identity",position = 'dodge') +
labs(x="Months", y= "Payments") +
ggtitle("payments increase every year")
pl
year_plot
NA
#redo this with non summarize raw data
plbox <- ggplot(year_plot,aes(y=sum_pay, x = dos_month) )+
geom_boxplot() +
labs(x="Months", y= "Payments") +
ggtitle("T")
plbox
NA
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
summary_2017<-summary_wds %>%
group_by(dos_month,dos_year) %>%
summarise(
total_patients = NROW(patient_id),
totalpayments = sum(payment),
min_pay = min(payment),
max_pay = max(payment),
avg_pay = sum(payment)/NROW(patient_id)) %>%
ungroup() %>%
arrange(dos_year,dos_month) %>%
filter(dos_year =="2017")
summary_2017
summary_2017_pivot <- summary_2017 %>%
pivot_longer(min_pay:avg_pay, names_to = "payment_type", values_to = "payment")
summary_2017_pivot$dos_month = as.factor(summary_2017_pivot$dos_month)
summary_2017_pivot
NA
pl<- ggplot(summary_2017_pivot,aes(x=dos_month, y = payment, fill = payment_type)) +
geom_col(stat = "identity",position = 'dodge') +
geom_hline(yintercept = 6000, linetype = "dashed",color = "darkred") +
labs(x="Months", y = "Payments") +
ggtitle("2017 minimum, maximum and average payments")
Ignoring unknown parameters: stat
pl + scale_y_continuous(breaks = seq(0,130000,5000))
NA
NA
all_payments<-summary_wds %>%
group_by(dos_month) %>%
summarise(
total_patients = NROW(patient_id),
totalpayments = sum(payment),
min_pay = min(payment),
max_pay = max(payment),
avg_pay = sum(payment)/NROW(patient_id)) %>%
ungroup() %>%
pivot_longer(min_pay:avg_pay, names_to='payment_type',values_to = 'payments')
all_payments
#redo with box plot
pl_all<- ggplot(all_payments,aes(x=dos_month, y = payments, fill = payment_type)) +
geom_col(stat = "identity",position = 'dodge') +
geom_hline(yintercept = 6000, linetype = "dashed",color = "darkred") +
labs(x="Months", y = "Payments") +
ggtitle("minimum, maximum and average payments")
Ignoring unknown parameters: stat
pl_all + scale_y_continuous(breaks = seq(0,130000,5000))
NA
NA
ANALYSIS USING SUMMARY AND DETAIL DATA Get the data and clean it
#get detail and summary data and save it in a df
detail_raw_data<-read_csv ('data/claim_details.csv' )
Duplicated column names deduplicated: 'patient_id' => 'patient_id_1' [37]Parsed with column specification:
cols(
.default = col_double(),
esco_id = [31mcol_character()[39m,
bene_hic_num = [31mcol_character()[39m,
claim_first_dialysis_date = [31mcol_character()[39m,
claim_last_dialysis_date = [31mcol_character()[39m,
location_id = [31mcol_character()[39m,
start_date = [34mcol_datetime(format = "")[39m,
end_date = [34mcol_datetime(format = "")[39m,
esrd_date = [34mcol_datetime(format = "")[39m,
modality = [31mcol_character()[39m,
hgb_cutoff = [31mcol_character()[39m,
epo_ceiling = [31mcol_character()[39m,
route_freq = [31mcol_character()[39m,
drug_name = [31mcol_character()[39m,
ferritin_cutoff = [31mcol_character()[39m,
tsat_cutoff = [31mcol_character()[39m,
sup_name = [31mcol_character()[39m,
tx_epo = [31mcol_character()[39m,
venofer_given = [31mcol_character()[39m,
venofer_wasted = [31mcol_character()[39m,
ferrlecit_given = [31mcol_character()[39m
# ... with 42 more columns
)
See spec(...) for full column specifications.
1 parsing failure.
row col expected actual file
7272 esrd_date date like NULL 'data/claim_details.csv'
dci_data<-detail_raw_data
dci_data<-dci_data %>%
select(-esco_id,-bene_hic_num,-full_encounter,-phys_neph,-phys_hosp,-phys_ed,-ambulance,-phys_other,-esco_aligned_flag,-inpatient_fluid,-outpatient_er_fluid,-part_a_other_fluid,-patient_id_1,-dos_yyyy,-dos_mm,-esrd_date,-hgb_date,-tsat_date,-ferr_date,-albumin_date,-pth_date,-ca_date,-cca_date,-ph_date,-k_date,-urr_date,-ktv_date,-epo_given,-venofer_wasted,-inpatient_access,-outpatient_er_access,-part_a_other_access,-dci_claims,-non_dci_claims,-hha,-hospice,-dme,-start_date,-end_date,-epo_ceiling,-route_freq,-drug_name,-ferritin_cutoff,-tsat_cutoff,-hgb_cutoff,-ferrlecit_wasted,-zemplar_iv_wasted,-calcijex_iv_wasted,-feraheme_wasted,-hectorol_iv_wasted,-tx_missed,-sensipar_dispensed)
#make sure all the columns are in correct data types.
#change claims date columns to date datatype
date_columns <- c("claim_first_dialysis_date","claim_last_dialysis_date")
dci_data[date_columns] <- lapply(dci_data[date_columns],as.Date)
#change the other columns to factor
fac_columns <- c("patient_id","dos_year","location_id","dos_month","modality","tx","tx_epo","sup_name")
dci_data[fac_columns] <- lapply(dci_data[fac_columns],as.factor)
#change the below columns to logical
bool_columns <- c("epo_protocol_flag","iron_protocol_flag","nutsup_protocol_flag","hgb_exclude_flag","active_flag")
dci_data[bool_columns] <- lapply(dci_data[bool_columns],as.logical)
#Make below colums as logical true if they have any value else if they have null make it logical false.
dci_data<-dci_data %>%
mutate_at(vars("ferrlecit_given","feraheme_given","venofer_given","zemplar_iv_given","hectorol_iv_given","calcijex_iv_given","zemplar_or_given","hectorol_or_given","calcijex_iv_given","zemplar_or_given","hectorol_or_given","calcijex_or_given","activase_given","prostat_given","nepro_given","liquacel_given","has_catheter","aranesp_given","protinex_given","mircera_given","sensipar_given","parsabiv_given","protein_bar_given",
"ice_cream_given","gelatein_given"),
funs(case_when(.=="NULL" ~ FALSE,
TRUE ~ TRUE)))
#FIRST make the null VALUES in char col to zero's except for date columns
num_columns <- c("hgb","tsat","ferr","albumin","pth","ca","cca","ph","k","urr","ktv","tx_epo")
dci_data[num_columns]<-dci_data[num_columns]%>%
replace(.=="NULL","0")
#change all the character columns ot numeric
dci_data<-dci_data %>% mutate_if(is.character,as.numeric)
#names(dci_data)
#as for date columns we cannot replace na values ot 0. first make them character columns and then make rest to the na values to 0 in entire dataframe
dci_data$claim_first_dialysis_date <-as.character.Date(dci_data$claim_first_dialysis_date )
dci_data$claim_last_dialysis_date <-as.character.Date(dci_data$claim_last_dialysis_date )
#make all na values to 0 in entire dataframe
dci_data[is.na(dci_data)]<-0
# sum(is.na(dci_data))
#making the dates column back to date datatype
dci_data$claim_first_dialysis_date <- as.Date(dci_data$claim_first_dialysis_date)
dci_data$claim_last_dialysis_date <- as.Date(dci_data$claim_last_dialysis_date)
#delete the outliers( payments which are less than 1500)
dci_data<-dci_data %>%
filter(`payment`>1500) %>%
arrange(desc(payment))
#combine 2 separate part_b payments to one
dci_data<-dci_data %>%
mutate(part_b = part_b_phys + part_b_dme)
dci_data %>%
select(payment,part_a,part_b_phys,part_b_dme,part_b)
#sum(is.na(dci_data$claim_last_dialysis_date))
sum(is.na(dci_data))
[1] 220
str(dci_data)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 9970 obs. of 72 variables:
$ patient_id : Factor w/ 672 levels "30111","39732",..: 293 50 64 251 21 31 293 86 441 172 ...
$ dos_year : Factor w/ 2 levels "2017","2018": 1 1 2 1 2 2 1 2 2 2 ...
$ dos_month : Factor w/ 12 levels "1","2","3","4",..: 7 7 4 7 3 7 9 2 3 8 ...
$ claim_first_dialysis_date: Date, format: NA NA NA "2017-07-13" ...
$ claim_last_dialysis_date : Date, format: NA NA NA "2017-07-29" ...
$ payment : num 128637 122674 121244 112103 109108 ...
$ part_a : num 119013 119153 111345 110036 102586 ...
$ part_b_phys : num 9623 3521 9899 2067 6521 ...
$ part_b_dme : num 0 0 0 0 0 ...
$ inpatient : num 119013 119153 111345 108511 102586 ...
$ outpatient_dialysis : num 0 0 0 1525 0 ...
$ outpatient_er : num 0 0 0 0 0 ...
$ outpatient_other : num 0 0 0 0 0 ...
$ snf : num 0 0 0 0 0 ...
$ vasc_access : num 179 0 309 0 0 ...
$ location_id : Factor w/ 25 levels "000026","000055",..: 18 22 17 24 25 3 18 1 4 5 ...
$ modality : Factor w/ 3 levels "HH","HIC","PD": 2 2 2 2 2 2 2 2 2 2 ...
$ epo_protocol_flag : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ iron_protocol_flag : logi FALSE TRUE TRUE TRUE TRUE TRUE ...
$ nutsup_protocol_flag : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ sup_name : Factor w/ 11 levels "Body Quest Ice Cream",..: 4 5 9 4 4 11 4 11 11 1 ...
$ tx : Factor w/ 24 levels "0","1","2","3",..: 9 1 5 1 1 3 1 1 6 1 ...
$ tx_epo : Factor w/ 16 levels "0","1","10","11",..: 9 1 2 1 1 8 1 1 2 1 ...
$ venofer_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ferrlecit_given : logi TRUE FALSE FALSE FALSE FALSE FALSE ...
$ feraheme_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ zemplar_iv_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ hectorol_iv_given : logi TRUE FALSE TRUE FALSE FALSE TRUE ...
$ calcijex_iv_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ zemplar_or_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ hectorol_or_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ calcijex_or_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ activase_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ prostat_given : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ nepro_given : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ liquacel_given : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ hgb : num 11.1 0 8.6 0 0 10.4 0 0 0 0 ...
$ tsat : num 23 39 38 23 32 56 0 0 18 50 ...
$ ferr : num 2051 1401 1591 1037 667 ...
$ albumin : num 4 0 0 0 0 3.5 0 0 0 0 ...
$ pth : num 559 577 224 0 641 ...
$ ca : num 8.7 0 0 0 0 8.7 0 0 0 0 ...
$ cca : num 8.7 0 0 0 0 9.1 0 0 0 0 ...
$ ph : num 4.2 0 0 0 0 5 0 0 0 0 ...
$ k : num 4.9 0 0 0 0 5.1 0 0 0 0 ...
$ urr : num 72 0 70 0 0 78 0 0 0 0 ...
$ ktv : num 1.45 0 1.35 0 0 1.72 0 0 0 0 ...
$ has_catheter : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ aranesp_given : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ protinex_given : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ hgb_exclude_flag : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ mircera_given : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ sensipar_given : logi FALSE FALSE TRUE FALSE TRUE TRUE ...
$ parsabiv_given : logi FALSE FALSE TRUE FALSE TRUE TRUE ...
$ protein_bar_given : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
$ ice_cream_given : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
$ gelatein_given : logi FALSE FALSE FALSE FALSE FALSE TRUE ...
$ active_flag : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ hospital_episodes : num 2 1 2 2 1 3 1 0 1 1 ...
$ hospital_admits : num 2 0 2 1 0 3 0 0 1 0 ...
$ hospital_discharges : num 1 0 1 1 0 3 0 0 0 0 ...
$ er_visits : num 0 0 0 0 0 0 0 0 1 0 ...
$ snf_episodes : num 0 0 0 0 0 1 0 1 1 0 ...
$ snf_admit : num 0 0 0 0 0 0 0 0 0 0 ...
$ snf_discharge : num 0 0 0 0 0 1 0 0 1 0 ...
$ med_orders : num 8 16 10 7 12 46 8 9 20 16 ...
$ cardio : num 2 5 2 2 3 8 2 2 1 3 ...
$ beta_blockers : num 1 1 0 1 1 1 1 1 1 1 ...
$ antihypertensives : num 0 1 0 0 0 3 0 0 0 0 ...
$ opioids : num 1 0 0 0 0 1 1 1 1 0 ...
$ antidiabetics : num 0 1 3 1 1 4 0 0 1 3 ...
$ part_b : num 9623 3521 9899 2067 6521 ...
copy the cleaned dataset into another dataset just as a back up
dci_data_ws<- dci_data
saveRDS(dci_data_ws,file="dci_data.rds")
Histograms to understand the distribution of the data for payments
dci_data_ws %>%
select (payment,part_a,part_b_phys,part_b_phys) %>%
ggplot(aes(x=payment) )+
geom_histogram(color="#e9ecef", alpha=0.6, position = 'identity', bins=40) +
scale_x_log10()+
scale_fill_manual(values=c("#69b3a2", "#404080")) +
labs(x="loggged payment values",
y="Counts",
title = "Payment distribution")
#make this interactive
p<- ggplot(dci_data_ws,aes(x=payment,y=..density..)) +
geom_histogram(bins=40,binwidth = 0.05,fill = "black",color="black",alpha=0.2) +
scale_x_log10()+
geom_density(color = "red")+
labs(x="Logged payments",
y="Frequency",
title = "Total Payments distribution")
ggplotly(p)
NA
NA
NA
#make this interactive
p<- ggplot(dci_data_ws,aes(x=payment,fill=modality)) +
geom_histogram(bins=10,binwidth = 0.25,alpha=0.8,position = "dodge") +
scale_x_log10()+
labs(x="loggged payment values",
y="Frequency",
title = "Payment distribution")
p
Modality distributions
#density distribution for modalities
ggplot(dci_data_ws,aes(payment, color = modality,fill=modality)) +
scale_x_log10()+
geom_density(alpha = 0.8,position = "dodge")
dci_data_ws %>%
select(payment,modality) %>%
group_by(modality) %>%
summarise(total_payments=sum(payment),num_of_rows=n())
#Function to create an histgram for each kind of madality HIC,HH AND PD
make_plot <- function(mod){
dci_data_ws %>%
select(payment,modality) %>%
filter(modality == mod) %>%
ggplot(aes(x=payment)) +
geom_histogram(binwith=1000) +
scale_x_log10() +
labs(x="payments",
y="frequency",
title = paste("Distribution of",mod))
}
make_plot('HH')
Ignoring unknown parameters: binwith
make_plot('HIC')
Ignoring unknown parameters: binwith
make_plot('PD')
Ignoring unknown parameters: binwith
dci_data_ws %>%
select(payment,modality) %>%
filter(modality == 'HH') %>%
ggplot(aes(x=payment)) +
geom_histogram() +
labs(x="payments",
y="frequency",
title = "Distribution of Hemo Home Payments")
NA
dci_data_ws %>%
select(payment,modality) %>%
#filter(payment>2800 & payment <3500) %>%
group_by(modality)
NA
NA
NA
NA
dci_data_ws %>%
select(payment,modality) %>%
filter(payment>2800 & payment <3500) %>%
#group_by(modality) %>%
filter(modality=='PD') %>%
ggplot(aes(x=payment)) +
geom_histogram(binwidth = 4000)
dci_data_ws %>%
select(payment,modality) %>%
filter(payment>2800 & payment <3500)
library(reshape2)
dci_long <- reshape2::melt(dci_data_ws)
Using patient_id, dos_year, dos_month, location_id, modality, epo_protocol_flag, iron_protocol_flag, nutsup_protocol_flag, sup_name, tx, tx_epo, venofer_given, ferrlecit_given, feraheme_given, zemplar_iv_given, hectorol_iv_given, calcijex_iv_given, zemplar_or_given, hectorol_or_given, calcijex_or_given, activase_given, prostat_given, nepro_given, liquacel_given, has_catheter, aranesp_given, protinex_given, hgb_exclude_flag, mircera_given, sensipar_given, parsabiv_given, protein_bar_given, ice_cream_given, gelatein_given, active_flag as id variables
attributes are not identical across measure variables; they will be dropped
ggplot(dci_long, aes(value)) + facet_wrap(~variable, scales = 'free_x') +
geom_histogram()
NA
NA
#frequency distribution with density lines
#x<- sample(0:30, 200, replace=T, prob=15 - abs(15 - 0:30))
x<-dci_data_ws$payment
## Calculate and plot the two histograms
hcum <- h <- hist(x, plot=FALSE)
hcum$counts <- cumsum(hcum$counts)
plot(hcum, main="")
plot(h, add=T, col="grey")
## Plot the density and cumulative density
d <- density(x)
lines(x = d$x, y = d$y * length(x) * diff(h$breaks)[1], lwd = 2)
lines(x = d$x, y = cumsum(d$y)/max(cumsum(d$y)) * length(x), lwd = 2)
#distribution by payment type
payment_type_dist<-dci_data_ws %>%
select(dos_year,dos_month,payment,part_a,part_b_dme,part_b_phys,part_b)%>%
pivot_longer(payment:part_b,names_to="payment_type",values_to = "payment")
saveRDS(payment_type_dist,file="payment_type_dist.rds")
#plot the fequency distribution across different payment types
pl<-dci_data_ws %>%
select(dos_year,dos_month,location_id,payment,part_a,part_b_dme,part_b_phys,part_b)%>%
pivot_longer(payment:part_b,names_to="payment_type",values_to = "payment") %>%
filter(dos_year==2017 & dos_month==1) %>%
ggplot( aes(x=payment,fill=payment_type,color= payment_type)) +
geom_histogram(bins= 40,binwidth = 0.05,alpha=0.3,position = "identity") +
scale_x_log10() +
geom_vline(aes(xintercept = mean(payment,na.rm=T)),color = "red", linetype = "dashed",size = 1) +
labs(x="Logged payments",y="Frequency",title = "distribution of payments across payment sources")
ggplotly(pl)
Transformation introduced infinite values in continuous x-axisRemoved 286 rows containing non-finite values (stat_bin).
sum(is.na(dci_data$payment))
[1] 0
#fat tailed dis.
box_pl<-payment_type_dist %>%
filter(dos_year == 2017 & dos_month == 1) %>%
ggplot(aes(y = payment,x = payment_type,fill=payment_type)) +
geom_boxplot()+
# scale_y_log10() +
theme_classic() +
labs(x="Payment type", y = "Payment",
title = "Payment distribution by payment types")
ggplotly(box_pl)
years_plot<- dci_data_ws %>%
group_by(dos_year,dos_month) %>%
summarise(sum_pay = sum(payment)) %>%
ungroup()
pl <- ggplot( years_plot,aes(y=sum_pay, x = dos_month ,fill=dos_year)) +
geom_bar(stat = "identity",position = 'dodge') +
labs(x="Months", y= "Payments") +
ggtitle("Total payments increase in each month")
pl
years_plot
NA
dci_data_ws %>%
group_by(modality) %>%
summarise(payments = sum(payment)) %>%
ggplot(aes(x=modality,y=payments,fill = `payments`)) +
geom_bar(stat="identity" )
NA
NA
#show how modalities are doing by total payments
modality_pl<-dci_data_ws %>%
group_by(dos_year,modality) %>%
summarise(payments = sum(payment)) %>%
ungroup() %>%
ggplot(aes(x=dos_year,y=payments,fill=modality,color=modality))+
geom_bar(stat="identity",position="dodge") +
geom_text(aes(label = round(payments),vjust=0),postion=position_dodge(width=5)) +
labs(x="Modalities in 2017 and 2018",y="payments",title="Payments by modality")
Ignoring unknown parameters: postion
modality_pl
dci_data_ws %>%
ggplot(aes(x=dos_year,y=payment,color = modality)) +
geom_boxplot() +
scale_y_log10() +
labs(x = "Years",y="payments",
title = "Payments in in 2017 and 2018")
correlational plot
num_columns <- c("hgb","tsat","ferr","albumin","pth","ca","cca","ph","k","urr","ktv")
corrs<-dci_data_ws %>% select(num_columns) %>%
# drop_na_() %>%
cor()
corrs
hgb tsat ferr albumin pth ca cca ph k urr
hgb 1.00000000 0.0527380941 0.02570081 0.59628329 0.045803409 0.58340275 0.56441343 0.3009506342 0.4724638 0.21430294
tsat 0.05273809 1.0000000000 0.39986167 0.08686222 -0.027163707 0.04433107 0.03910477 0.0002180982 0.0788412 0.06797837
ferr 0.02570081 0.3998616659 1.00000000 0.05132422 0.005239110 0.05485813 0.05667298 -0.0513212701 0.1183593 0.20079170
albumin 0.59628329 0.0868622245 0.05132422 1.00000000 0.104443153 0.76304243 0.67544496 0.3994618231 0.6250713 0.31241463
pth 0.04580341 -0.0271637069 0.00523911 0.10444315 1.000000000 0.03478453 0.01573087 0.2973311051 0.1093904 0.02051048
ca 0.58340275 0.0443310656 0.05485813 0.76304243 0.034784525 1.00000000 0.98442825 0.3359978827 0.6256086 0.27040412
cca 0.56441343 0.0391047722 0.05667298 0.67544496 0.015730870 0.98442825 1.00000000 0.3196562589 0.6140480 0.25157942
ph 0.30095063 0.0002180982 -0.05132127 0.39946182 0.297331105 0.33599788 0.31965626 1.0000000000 0.4446666 0.08193608
k 0.47246376 0.0788412002 0.11835935 0.62507130 0.109390410 0.62560864 0.61404796 0.4446665609 1.0000000 0.34881524
urr 0.21430294 0.0679783689 0.20079170 0.31241463 0.020510483 0.27040412 0.25157942 0.0819360755 0.3488152 1.00000000
ktv 0.24719233 0.0879913547 0.06463618 0.27081825 0.004094452 0.30377824 0.30664367 0.0767076454 0.2096817 0.08050080
ktv
hgb 0.247192331
tsat 0.087991355
ferr 0.064636183
albumin 0.270818252
pth 0.004094452
ca 0.303778243
cca 0.306643666
ph 0.076707645
k 0.209681668
urr 0.080500799
ktv 1.000000000
library(corrplot)
corrplot(corrs,type = "upper",order = "hclust",
tl.col="black",tl.srt=45)
num_columns <- c("payment","hgb","tsat","ferr","albumin","pth","ca","cca","ph","k","urr","ktv")
corrs<-dci_data_ws %>% select(num_columns) %>%
cor()
corrplot(corrs,type = "upper",order = "hclust",
tl.col="black",tl.srt=45)
NA
NA
tibble('variable' = corrs[1,2:12] %>% names(),'correlation' = corrs[1,2:12]) %>%
ggplot(aes(x=reorder(variable,correlation),y = correlation)) +
geom_point()+
geom_segment(aes(xend=variable,yend=0))+
coord_flip() +
geom_hline(yintercept = 0)
pl <- ggplot(dci_data_ws, aes(x=albumin,y=payment) ) +
geom_point(alpha=0.2) + geom_smooth(method = 'lm') +
scale_x_log10() +
scale_y_log10()
ggplotly(pl)
Transformation introduced infinite values in continuous x-axisTransformation introduced infinite values in continuous x-axisRemoved 195 rows containing non-finite values (stat_smooth).
#unique(dci_data_ws$ktv)
#hgb vs payment
ggplot(dci_data_ws, aes(x=hgb,y=payment) ) +
geom_point() + geom_smooth(method = 'lm') +
scale_x_log10() +
labs(x="hgb", y = "payment", title = "Payment Vs hgb")
NA
NA
ggplot(dci_data_ws, aes(x=ca,y=payment) ) +
geom_point() + geom_smooth(method = 'lm') +
scale_x_log10() +
labs(x="ca", y = "payment", title = "Payment Vs ca")
Maps
#first prep the dataset to get the information on the map markers
payment_summary_by_loc<-dci_data_ws %>%
select(patient_id,location_id,dos_month,dos_year,payment,part_a,part_b_phys,part_b_dme,modality) %>%
group_by(location_id,dos_month,dos_year ) %>%
summarise(
total_patients = NROW(patient_id),
totalpayments = round(sum(payment),digits=2),
min_pay = min(payment),
max_pay = max(payment),
avg_pay = sum(payment)/NROW(patient_id)) %>%
ungroup() %>%
arrange(dos_year,dos_month)
#merge the grouped data with location dataset
payment_geom_summary <- merge(payment_summary_by_loc,locations,by = "location_id")
payment_geom_summary %>%
filter(location_id =='000055' & dos_month ==1 & dos_year ==2017)
#save it to rds file
saveRDS(payment_geom_summary, file = "DCI_midcourse/data/payment_geom_summary.rds")
payment_geom_summary %>%
arrange(desc(totalpayments)) %>%
filter(dos_year==2017 & dos_month == 1)
NA
#Top 5 locations with high average payments
loc_plt<-payment_summary_by_loc %>%
arrange(desc(totalpayments)) %>%
filter(dos_year==2017 & dos_month == 4) %>%
top_n(5) %>%
ggplot(aes(x=location_id,y=avg_pay)) +
geom_col( )
Selecting by avg_pay
ggplotly(loc_plt)
NA
NA
leaflet(data= locations) %>%
addTiles() %>%
addMarkers(~longitude,
~latitude,
popup = ("hello"))
pl <- ggplot(dci_data_ws, aes(x=albumin,y=payment) ) +
geom_point(alpha=0.2) + geom_smooth(method = 'lm') +
scale_x_log10() +
scale_y_log10()
ggplotly(pl)
Transformation introduced infinite values in continuous x-axisTransformation introduced infinite values in continuous x-axisRemoved 195 rows containing non-finite values (stat_smooth).
dci_data_ws %>%
select(location_id,dos_year,dos_month,payment,part_a,part_b) %>%
mutate(percentage_a = round(part_a/payment*100),
percentage_b = round(part_b/payment*100))
NA
NA
NA
dci_data_ws %>%
select(dos_year,dos_month,payment,part_a,part_b) %>%
mutate(percent_part_a = round(part_a/payment*100)) %>%
filter(dos_year=2017,dos_month=1)
dci_data_shiny %>%
select (location_id,patient_id,payment,part_a,part_b)
NA
install.packages("sunburstR")
also installing the dependency ‘d3r’
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/d3r_0.8.7.tgz'
Content type 'application/x-gzip' length 404610 bytes (395 KB)
==================================================
downloaded 395 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.6/sunburstR_2.1.3.tgz'
Content type 'application/x-gzip' length 532381 bytes (519 KB)
==================================================
downloaded 519 KB
The downloaded binary packages are in
/var/folders/v7/gmk316lj6f7_cdbjbr0vvjgw0000gn/T//Rtmplevi5y/downloaded_packages